This notebook systematically performs weak signal analysis on BrewDog locations in the UK in order to create a composite measure capable of predicting which LSOAs in the UK would support a new BrewDog location.
In this section we set up the workspace we will use within this notebook - the python libraries, and the local directory containing the data inputs and outputs produced over the course of this analysis.
import datalab
from tensorflow.python.lib.io import file_io
import json
import math
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import random
import os
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from google.datalab import Context
import google.datalab.storage as storage
The local development workspace will be in
/content/datalab/workspace/brewdog.
workspace_path = '/content/datalab/workspace/brewdog'
!mkdir -p {workspace_path}
If you have previously run this notebook, and want to start from scratch, then run the next cell to delete and create the workspace directory.
!rm -rf {workspace_path} && mkdir {workspace_path}
In order to begin our analysis, we need to load the UK Master Dataset that we produced using our virtualized data layer (DataPrep) from Cloud Storage.
!gsutil -q cp gs://princeton-thesis-data/uk-master/uk-master-data.csv {workspace_path}/data/uk-master-data.csv
!ls -l {workspace_path}/data
Next, create a Pandas dataframe from the dataset we just loaded in.
df_data = pd.read_csv(os.path.join(workspace_path, 'data/uk-master-data.csv'), dtype=str)
print '%d rows' % len(df_data)
df_data.describe()
In order to perform weak signal analysis, we must standardize, vectorize, and normalize the dataset. This section carries out each of those steps. We also generate a random artificial dataset to use to verify that the correlations we discover are meaningful.
# Standardize to Percentage of Population Indicators
def standardize_data(df):
column_names = list(df)
population_count_columns = [col for col in column_names if 'population_count' in col]
total_population_column_name = 'Area_Population_Density_All_usual_residents_population_count'
total_population_column = df[total_population_column_name]
total_population_index = population_count_columns.index(total_population_column_name)
del population_count_columns[total_population_index] # remove total population count from columns
for col in population_count_columns:
new_name = col.replace('population_count', 'percentage_of_population')
df[new_name] = df[col] / total_population_column
del df[col]
return df
# Vectorize data based on analysis of indicators
def vectorize_data(df):
# uncomment indicators to add to negative vectorization list
negative_vector_columns = [
# 'Economic_Activity_Economically_active_Employee_Part_time_percentage_of_population',
# 'Economic_Activity_Economically_active_Employee_Full_time_percentage_of_population',
# 'Economic_Activity_Economically_active_Self_employed_with_employees_Part_time_percentage_of_population',
# 'Economic_Activity_Economically_active_Unemployed_percentage_of_population',
# 'Economic_Activity_Economically_active_Full_time_student_percentage_of_population',
'Economic_Activity_Economically_inactive_Total_percentage_of_population',
'Economic_Activity_Economically_inactive_Retired_percentage_of_population',
'Economic_Activity_Economically_inactive_Student_including_full_time_students_percentage_of_population',
'Economic_Activity_Economically_inactive_Long_term_sick_or_disabled_percentage_of_population',
# 'Qualification_All_categories_Highest_level_of_qualification_percentage_of_population',
'Qualification_No_qualifications_percentage_of_population',
# 'Distance_travelled_to_work_Less_than_2km_percentage_of_population',
# 'Method_of_Travel_to_Work_Underground_metro_light_rail_tram_percentage_of_population',
# 'Method_of_Travel_to_Work_Train_percentage_of_population',
# 'Method_of_Travel_to_Work_Bus_minibus_or_coach_percentage_of_population',
# 'Method_of_Travel_to_Work_Taxi_percentage_of_population',
# 'Method_of_Travel_to_Work_Motorcycle_scooter_or_moped_percentage_of_population',
'Method_of_Travel_to_Work_Driving_a_car_or_van_percentage_of_population',
'Method_of_Travel_to_Work_Passenger_in_a_car_or_van_percentage_of_population',
# 'Method_of_Travel_to_Work_Bicycle_percentage_of_population',
# 'Method_of_Travel_to_Work_On_foot_percentage_of_population',
# 'Sex_Males_percentage_of_population',
'Sex_Females_percentage_of_population',
# 'Proficiency_in_English_Main_language_is_English_English_or_Welsh_in_Wales_percentage_of_population',
# 'Religion_Christian_percentage_of_population',
'Religion_Buddhist_percentage_of_population',
'Religion_Hindu_percentage_of_population',
'Religion_Jewish_percentage_of_population',
'Religion_Muslim_percentage_of_population',
'Religion_No_religion_percentage_of_population',
# 'Hours_Worked_Part_time_15_hours_or_less_worked_percentage_of_population',
# 'Hours_Worked_Part_time_16_to_30_hours_worked_percentage_of_population',
# 'Hours_Worked_Full_time_31_to_48_hours_worked_percentage_of_population',
# 'Hours_Worked_Full_time_49_or_more_hours_worked_percentage_of_population',
# 'Cars_No_cars_or_vans_in_household_percentage_of_population',
'Cars_sum_of_All_cars_or_vans_in_the_area_percentage_of_population',
# 'Ethnic_Group_White_percentage_of_population',
'Ethnic_Group_Mixed_percentage_of_population',
'Ethnic_Group_Asian_percentage_of_population',
'Ethnic_Group_Black_percentage_of_population',
'Ethnic_Group_Other_percentage_of_population',
# 'Industry_A_Agriculture_forestry_and_fishing_percentage_of_popu,lation',
# 'Industry_B_Mining_and_quarrying_percentage_of_population',
# 'Industry_C_Manufacturing_percentage_of_population',
# 'Industry_D_Electricity_gas_steam_and_air_conditioning_supply_percentage_of_population',
# 'Industry_E_Water_supply_sewerage_waste_management_and_remediation_activities_percentage_of_population',
# 'Industry_F_Construction_percentage_of_population',
# 'Industry_G_Wholesale_and_retail_trade_repair_of_motor_vehicles_and_motor_cycles_percentage_of_population',
# 'Industry_H_Transport_and_storage_percentage_of_population',
# 'Industry_I_Accommodation_and_food_service_activities_percentage_of_population',
# 'Industry_J_Information_and_communication_percentage_of_population',
# 'Industry_K_Financial_and_insurance_activities_percentage_of_population',
# 'Industry_L_Real_estate_activities_percentage_of_population',
# 'Industry_M_Professional_scientific_and_technical_activities_percentage_of_population',
# 'Industry_N_Administrative_and_support_service_activities_percentage_of_population',
# 'Industry_O_Public_administration_and_defence_compulsory_social_security_percentage_of_population',
# 'Industry_P_Education_percentage_of_population',
# 'Industry_Q_Human_health_and_social_work_activities_percentage_of_population',
# 'Industry_R_S_Arts_entertainment_and_recreation_other_service_activities_percentage_of_population',
# 'Industry_T_Activities_of_households_as_employers_undifferentiated_goods_and_services_producing_activities_of_households_for_own_use_percentage_of_population',
# 'Industry_U_Activities_of_extraterritorial_organisations_and_bodies_percentage_of_population',
# 'General_Health_Very_good_health_percentage_of_population',
# 'General_Health_Good_health_percentage_of_population',
# 'General_Health_Fair_health_percentage_of_population',
'General_Health_Bad_health_percentage_of_population',
'General_Health_Very_bad_health_percentage_of_population',
# 'Age_0_to_17_percentage_of_population',
# 'Age_18_to_30_percentage_of_population',
# 'Age_30_40_percentage_of_population',
# 'Age_40_50_percentage_of_population',
# 'Age_50_60_percentage_of_population',
# 'Age_60_70_percentage_of_population',
# 'Age_70_80_percentage_of_population',
# 'Age_80_90_percentage_of_population',
# 'Age_90_100_percentage_of_population'
]
for col in negative_vector_columns:
df[col] = df[col].apply(lambda x: x * -1)
return df
# Normalize data using Z-scores
def normalize_data(df):
column_names = list(df)
# Get numeric columns to normalize
numeric_columns = [col for col in column_names if (col != 'geography_code' and
col != 'BrewDog' and
col != 'Starbucks' and
col != 'Binomial_Artificial_Data' and
col != 'Area_Population_Density_All_usual_residents_population_count' and
col != 'Area_Population_Density_Area_Hectares' and
col != 'Area_Population_Density_Density_number_of_persons_per_hectare')]
# Normalize each column
for col in numeric_columns:
col_zscore = col + '_z_score' # name of normalized column
#compute the zscore
df[col_zscore] = (df[col] - df[col].mean())/df[col].std(ddof=0)
del df[col] # delete the original column (not normalized)
return df
# Create composite age indicators (0-17, 18-30, 30-40, 40-50, 50-60, 60-70...)
def bucket_ages(df):
column_names = list(df)
del df['Age_Age_100_and_over_measures_Value'] # remove 100 and over
age_columns = [col for col in column_names if 'Age_Age_' in col]
## 0 to 17
age_0_to_17_columns = age_columns[0:17]
df['Age_0_to_17_population_count'] = df[age_0_to_17_columns].sum(axis=1)
for col in age_0_to_17_columns:
del df[col]
## 18 to 30
age_18_to_30_columns = age_columns[17:30]
df['Age_18_to_30_population_count'] = df[age_18_to_30_columns].sum(axis=1)
for col in age_18_to_30_columns:
del df[col]
## Rest of the age groups
base_age = 30 # comes from custom groupings above
for x in range(0,7):
low = base_age + (x * 10)
high = low + 10
age_group_columns = age_columns[low:high]
column_name = 'Age_' + str(low) + '_' + str(high) + '_population_count'
df[column_name] = df[age_group_columns].sum(axis=1)
for col in age_group_columns:
del df[col]
return df
# Create Column for Artificial Data by Binomial Distribution to show that correlations are valid
def generate_artifical_data(df):
number_of_brew_dogs = (df['BrewDog'] == 1).sum()
number_of_areas = len(df.index)
probability_of_brew_dog = float(number_of_brew_dogs) / float(number_of_areas)
df['Binomial_Artificial_Data'] = np.random.binomial(1, probability_of_brew_dog, number_of_areas)
return df
# Optionally person standardization, vectorization, and normalization on a dataframe
def transform_data(df_original, standardize, vectorize, normalize):
df = df_original.copy(deep=True)
column_names = list(df)
# Convert the columns to the correct datatype (for the UK all columns are numerical except geography_code so need to filter that out)
numeric_columns = [col for col in column_names if col != 'geography_code']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
# Create Age Buckets
df = bucket_ages(df)
# Rename columns with more descriptive name
df.columns = df.columns.str.replace('measures_Value', 'population_count')
column_names = list(df)
# Standardization
if standardize:
df = standardize_data(df)
# Artifical Data
df = generate_artifical_data(df)
# Move Columns to the front for correlation matrices
column_names = list(df)
column_names.insert(0, column_names.pop(column_names.index('Starbucks')))
column_names.insert(0, column_names.pop(column_names.index('Binomial_Artificial_Data')))
column_names.insert(0, column_names.pop(column_names.index('BrewDog')))
df = df.loc[:, column_names]
# Vectorization
if (vectorize):
df = vectorize_data(df)
# Normalization
if (normalize):
df = normalize_data(df)
return df
df_data_s = transform_data(df_data, True, False, False)
print '%d rows' % len(df_data_s)
df_data_s.describe()
df_data_st = transform_data(df_data, True, True, False)
print '%d rows' % len(df_data_st)
df_data_st.describe()
df_data_stn = transform_data(df_data, True, True, True)
print '%d rows' % len(df_data_st)
df_data_stn.describe()
In this section, we perform correlation analysis on the data. We use this to eliminate redundant indicators as well as ensure that the BrewDog locations have some correlation with the indicators when compared to what is generated with the artificial data.
correlation_analysis_toggle = True
The below function will allow us to visualize our correlation matrices.
# Visualize correlation matrix
def plot_correlation(data, name):
min_value = 0
max_value = 0
for i in range(len(data.columns)):
for j in range(len(data.columns)):
if i != j:
min_value = min(min_value, data.iloc[i, j])
max_value = max(max_value, data.iloc[i, j])
span = max(abs(min_value), abs(max_value))
span = round(span + .05, 1)
items = data.columns.tolist()
ticks = np.arange(0.5, len(items) + 0.5)
plot.figure(figsize = (66, 42))
plot.pcolor(data.values, cmap = 'RdBu', vmin = -span, vmax = span)
plot.colorbar().set_label('correlation')
plot.xticks(ticks, items, rotation = 'vertical')
plot.yticks(ticks, items)
The below function will allow us to perform Spearman, Pearson, and Kendall-Tau correlation analysis all at once and visualize the results
def correlation_analysis(df):
if (correlation_analysis_toggle):
# correlate
spearman_correlation = df.corr(method='spearman')
pearson_correlation = df.corr(method='pearson')
kendall_correlation = df.corr(method='kendall')
# visualize
plot_correlation(spearman_correlation, 'spearman')
plot_correlation(pearson_correlation, 'pearson')
plot_correlation(kendall_correlation, 'kendall')
This section shows correlation matrices generated after each transformation of the data in order to ensure their validity.
correlation_analysis(df_data_s)
correlation_analysis(df_data_st)
correlation_analysis(df_data_stn)
In this section, we look at the correlations generated by filtering LSOAs on a particular density range. The density range is determined by the clustering of BrewDogs around a certain range. In this way, we can further highlight indicator correlations with LSOAs containing BrewDogs.
# Given a dataframe, this function visualizes the population density distribution of LSOAs
def visualize_population_density(df):
df_fig, df_ax = plot.subplots()
df.hist(column='Area_Population_Density_Density_number_of_persons_per_hectare',
bins=175,
range=(0,350),
orientation='vertical',
ax=df_ax)
df_ax.set_xscale('log')
plot.title('LSOA Population Density Distribution')
plot.ylabel('Number of LSOAs')
plot.xlabel('Population Density (person per hectare)')
visualize_population_density(df_data_stn)
In this section we generate a dataframe of transformed data that only contains LSOAs with a BrewDog location.
# Filter dataframe to contain only LSOAs with BrewDog location
def filter_brewdog(df_original):
df = df_original.copy(deep=True)
return df.loc[df['BrewDog'] == 1]
df_data_stn_brewdog = filter_brewdog(df_data_stn)
df_data_stn_brewdog.describe()
visualize_population_density(df_data_stn_brewdog)
This section shows the population density distribution for the middle 50% of BrewDogs by population density of the LSOA they are contained in. It is this data that we will run through the correlation analysis to once again check that our indicators capture a signal corresponding to BrewDog location selection
# This function filters a dataframe on a population density range
def filter_density(df_original, minimum, maximum):
df = df_original.copy(deep=True)
return df.loc[(df['Area_Population_Density_Density_number_of_persons_per_hectare'] >= minimum) & (df['Area_Population_Density_Density_number_of_persons_per_hectare'] <= maximum)]
# choose density range currently showing middle 50%
min_density = 36.64 # 25%
max_density = 44.84 # 75%
df_data_stn_brewdog_50 = filter_density(df_data_stn_brewdog, min_density, max_density)
df_data_stn_brewdog_50.describe()
df_data_stn_50 = filter_density(df_data_stn, min_density, max_density)
df_data_stn_50.describe()
visualize_population_density(df_data_stn_brewdog_50)
visualize_population_density(df_data_stn_50)
correlation_analysis(df_data_stn_50)
In this section we remove indicators that are highly correlated. By removing highly correlated indicators we do not lose much in terms of signal but help to reduce the complexity of our model.
# This function takes a dataframe and and a threshold and filters columns that have
# a correlation that exceeds the threshold.
def correlation_filtration(df_original, threshold):
df = df_original.copy(deep=True)
# Set of all the names of deleted columns
cols_to_delete = set()
# filtration
correlation_matrix = df.corr()
for i in range(len(correlation_matrix.columns)):
for j in range(i):
if correlation_matrix.iloc[i, j] >= threshold:
column_name = correlation_matrix.columns[i]
cols_to_delete.add(column_name)
if column_name in df.columns:
del df[column_name]
return list(cols_to_delete)
list_columns_to_delete = correlation_filtration(df_data_stn, 0.9)
print list_columns_to_delete
def filter_columns(df_original, columns):
df = df_original.copy(deep=True)
return df.drop(columns=columns)
df_data_stn_corr = filter_columns(df_data_stn, list_columns_to_delete)
df_data_stn_corr.describe()
correlation_analysis(df_data_stn_corr)
df_data_stn_brewdog_corr = filter_columns(df_data_stn_brewdog, list_columns_to_delete)
df_data_stn_brewdog_corr.describe()
This section performs factor analysis using Principal Component Analysis to generate our composite measure.
This section performs PCA on our filtered BrewDog location dataset
# Create PCA Dataframe
def create_pca_dataframe(df_original, features):
df = df_original.copy(deep=True)
return pd.DataFrame(df, columns=features)
# Remove non-features to make features list
features = list(df_data_stn_brewdog_corr)
features.remove('Binomial_Artificial_Data')
features.remove('BrewDog')
features.remove('geography_code')
features.remove('Area_Population_Density_All_usual_residents_population_count')
features.remove('Area_Population_Density_Area_Hectares')
features.remove('Area_Population_Density_Density_number_of_persons_per_hectare')
df_pca = create_pca_dataframe(df_data_stn_brewdog_corr, features)
df_pca.describe()
# Separating out the features
x = df_data_stn_brewdog_corr.loc[:, features].values
# Separating out the target
target = 'BrewDog'
y = df_data_stn_brewdog_corr.loc[:, [target]].values
n_components = 10
pca = PCA(n_components=n_components)
pca.fit(x)
# Plot the variance explained by each component
def visualize_variance(pca):
variance = pca.explained_variance_ratio_
plot.plot(variance)
plot.title('PCA Ratio of Variance Explained by Components')
plot.xlabel('number of components')
plot.ylabel('ratio of variance explained')
visualize_variance(pca)
# Plot the cumalitve vriance explained by the components
def visualize_cumulative_variance(pca):
variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(np.round(variance, decimals=4) * 100)
plot.plot(cumulative_variance)
plot.title('PCA Cumulative Variance Explained by Components')
plot.xlabel('number of components')
plot.ylabel('cumulative explained variance')
visualize_cumulative_variance(pca)
In this section we generate the factor loadings from the PCA components using varimax rotation.
# Run PCA with the desired number of components
factors_to_generate = 5
factor_pca = PCA(n_components=factors_to_generate)
factor_pca.fit(x)
# Create Dataframe of factor loadings using a given number of componenents
def generate_factor_loadings(data, pca):
factors = ['Factor {}'.format(i) for i in range(1,len(pca.components_)+1)]
components = pd.DataFrame(np.round(pca.components_, 4), columns = data.keys())
components.index = factors
ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance'])
variance_ratios.index = factors
return pd.concat([variance_ratios, components], axis = 1).transpose()
factor_loadings = generate_factor_loadings(df_pca, factor_pca)
factor_loadings.head()
In this section we generate the squared factor loadings scaled to a unity sum. The result is a list of individual indicators with relative weightings.
def sqaure_factors(df):
factor_names = list(df)
# Sqaure the entries in each column
for factor in factor_names:
df[factor] = (df[factor] * df[factor])
return df
def unit_scale_factors(df):
factor_names = list(df)
# Scale to sum to 1
for factor in factor_names:
df[factor] = (df[factor] / df[factor].mean()) / len(list(df[factor]))
return df
# Sqaure factor loadings and scale to unity sum
def square_scale_factors(df_original):
df = df_original.copy(deep=True)
# remove explained variance
df = df.drop(['Explained Variance'])
# square and scale
df = sqaure_factors(df)
df = unit_scale_factors(df)
return df
factor_loadings_ss = square_scale_factors(factor_loadings)
In this section we use our squared and scaled factor loadings to generate our composite measure.
In this section we generate intermediate composite indicators. This is done by grouping the indicators with highest factor loadings into intermediate composite indicators.
# Determine the indicators to use from each factor by analyzing the sorted list of indicators
# by loading for each factor
def identify_highest_loadings(df_original, factor):
df = df_original.copy(deep=True)[factor]
df = df.sort_values(ascending=False)
return df
# Find the intermediate composite indicators for each factor
def find_intermediate_composite_indicators(df, loadings_per_factor):
factors = list(df)
highest_loadings_list = []
for factor in factors:
highest_loadings = identify_highest_loadings(df, factor)
highest_loadings = highest_loadings.iloc[0:loadings_per_factor]
highest_loadings_list.append(highest_loadings)
print factor
print '------------------'
print highest_loadings
print '------------------\n'
return highest_loadings_list
intermediate_composite_indicators_list = find_intermediate_composite_indicators(factor_loadings_ss, 10)
From the list of intermediate composite indicators, we can use the explained variance of the factors they were generated from, as well as the squared and scaled factor loadings, to compute the weightings for the indicators in the composite measure.
# Given an intermediate indicator list and the factors dataframe compute the weightings for the final composite measure
def calculate_weightings(intermediate_indicator_list, factors_df):
# dictionary to track indicator weightings for composite measure
indicator_weightings_dict = {}
# iterate through factors and intermediate indicators calcualting weighting
for i in range(0, factors_to_generate):
# get the intermediate indicator
intermediate_indicator = intermediate_indicator_list[i]
# get the factor and its explained variance corresponding to the intermediate indicator
factor = 'Factor ' + str((i + 1))
explained_variance = factors_df[factor]['Explained Variance']
for indicator, loading in intermediate_indicator.iteritems():
# if the indicator is not already in the dictionary, add it
if indicator not in indicator_weightings_dict:
indicator_weightings_dict[indicator] = 0
# get the indicator from the dictionary
current_indicator_weight = indicator_weightings_dict[indicator]
# add the intermediate indicator weight (product of factor loading and explained variance) to the current weight
indicator_weightings_dict[indicator] = current_indicator_weight + (loading * explained_variance)
return indicator_weightings_dict
composite_measure_indicator_weightings = calculate_weightings(intermediate_composite_indicators_list, factor_loadings)
def visualize_weightings(weightings_dict):
print "{:<150} {}".format('Indicator', 'Weight')
print '----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'
for indicator in weightings_dict:
print "{:<150}, {},".format(indicator, weightings_dict[indicator])
visualize_weightings(composite_measure_indicator_weightings)
In this section we use the indicator weightings to calculate the composite measure for each LSOA in the UK
# This function takes the UK master dataset and composite measure weightings to compute the a composite measure value for each LSOA
def calculate_composite_measure(df_original, weightings_dict):
df = df_original.copy(deep=True)
# initialize composite measure value to 0
df['Composite_Measure'] = 0
# calculate composite measure value
for indicator in weightings_dict:
weight = weightings_dict[indicator]
df['Composite_Measure'] = df['Composite_Measure'] + (df[indicator] * weight)
# Move composite measure to the front of dataframe
column_names = list(df)
column_names.insert(0, column_names.pop(column_names.index('Composite_Measure')))
df = df[column_names]
return df
df_data_stn_composite_measure = calculate_composite_measure(df_data_stn, composite_measure_indicator_weightings)
df_data_stn_composite_measure.describe()
In this section we use the composite measure to create a list of the top predicted LSOAs for a BrewDog location.
df_data_stn_composite_measure_sorted = df_data_stn_composite_measure.sort_values(['Composite_Measure'], ascending=False)
df_data_stn_composite_measure_sorted.head(50)
def write_cloud_storage(df_original):
df = df_original.copy(deep=True)
columns = ['geography_code', 'Composite_Measure']
df = df[columns]
project = Context.default().project_id
bucket_name = project + "-composite-measure"
bucket_path = 'gs://' + bucket_name
bucket_object_name = 'composite-measure-output.json'
bucket = storage.Bucket(bucket_name)
bucket.create()
bucket_object = bucket.object(bucket_object_name)
df_json = df.to_json(orient='index')
bucket_object.write_stream(df_json, 'text/plain')
write_cloud_storage(df_data_stn_composite_measure_sorted)
# Given a dataframe, this function visualizes the population density distribution of LSOAs
def visualize_composite_measure(df):
df_fig, df_ax = plot.subplots()
df.hist(column='Composite_Measure',
bins=175,
orientation='vertical',
ax=df_ax)
plot.title('LSOA Composite Measure Distribution')
plot.ylabel('Number of LSOAs')
plot.xlabel('Composite Measure Value')
visualize_composite_measure(df_data_stn_composite_measure_sorted)
df_data_stn_composite_measure_sorted.describe()